/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.util;

import java.io.*;
import java.util.*;

import net.nutch.db.*;

/*************************************************************
 * When we generate a fetchlist, we need to choose a "cutoff"
 * score, such that any scores above that cutoff will be included
 * in the fetchlist and any scores below it will not be.  (It is
 * too expensive to do the obvious thing, which is to sort the
 * list of all pages by score and pick the top K.)
 *
 * We need a good way to choose that cutoff.  ScoreStats is used
 * during LinkAnalysis to track the distribution of the scores we
 * compute.  We bucketize the score space into 2000 buckets.  The
 * first 1000 are equally-spaced counts for the range 0..1.0
 * (non-inclusive).  The second 1000 are logarithmically spaced
 * between 1 and Float.MAX_VALUE.
 *
 * If the score is < 1, we choose a bucket by taking
 * floor(score * 1000) and incrementing the resulting slot.
 *
 * If the score is >= 1, we take the base-10 log of the score and
 * scale it by (1000 / log10(Float.MAX_VALUE)), so that the range
 * 1..Float.MAX_VALUE maps evenly (in log space) onto the upper
 * 1000 slots.  The floor of that scaled value, offset by 1000,
 * is the bucket index.
 *
 * @author Mike Cafarella
 ***************************************************************/
public class ScoreStats {
    private final static double INVERTED_LOG_BASE_TEN =
        (1.0 / Math.log(10));
    private final static double EXP_127_MODIFIER =
        (1000.0 / (Math.log(Float.MAX_VALUE) * INVERTED_LOG_BASE_TEN));
    private final static double RANGE_COMPRESSOR =
        INVERTED_LOG_BASE_TEN * EXP_127_MODIFIER;

    long totalScores = 0;

    //
    // For bucketizing score counts
    //
    long buckets[] = new long[2001];

    /**
     */
    public ScoreStats() {
    }

    /**
     * Increment the counter in the right place.  We keep
     * 2000 different buckets.  Half of them are for scores < 1,
     * and half are for scores >= 1.
     *
     * Dies when it tries to fill bucket "1132".
     */
    public void addScore(float score) {
        if (score < 1) {
            // Scores below 1 fall into the evenly-spaced buckets 0..999.
            int index = (int) Math.floor(score * 1000);
            buckets[index]++;
        } else {
            // Scores of 1 and above fall into the log-spaced buckets
            // 1000..2000: take the base-10 log and rescale so that
            // Float.MAX_VALUE lands in the last slot.
            int index = (int) Math.floor(Math.log(score) * RANGE_COMPRESSOR);
            index += 1000;
            buckets[index]++;
        }
        totalScores++;
    }

    /**
     * Print out the distribution, with greater specificity
     * for percentiles 90th - 100th.
     */
    public void emitDistribution(PrintStream pout) {
        pout.println("***** Estimated Score Distribution *****");
        pout.println("  (to choose a fetchlist cutoff score)");
        pout.println();

        // Figure out how big each percentile chunk is.
        double decileChunk = totalScores / 10.0;
        double percentileChunk = totalScores / 100.0;

        // Now, emit everything
        double grandTotal = 0, minScore = Double.MAX_VALUE, maxScore = -Double.MAX_VALUE;
        long scoresSoFar = 0;
        int decileCount = 0, percentileCount = 0;

        // Go through all the sample buckets
        for (int i = 0; i < buckets.length; i++) {
            //
            // Always increment the seen-sample counter by the
            // number of samples in the current bucket.
            //
            scoresSoFar += buckets[i];

            // From the bucket index, recreate the
            // original score (as best we can)
            double reconstructedValue = 0.0;
            if (i < 1000) {
                reconstructedValue = i / 1000.0;
            } else {
                int localIndex = i - 1000;
                reconstructedValue = Math.exp(localIndex / RANGE_COMPRESSOR);
            }

            // Keep running stats on min, max, avg scores
            grandTotal += (reconstructedValue * buckets[i]);
            if (buckets[i] > 0) {
                if (minScore > reconstructedValue) {
                    minScore = reconstructedValue;
                }
                if (maxScore < reconstructedValue) {
                    maxScore = reconstructedValue;
                }
            }

            //
            // If the number of samples we've seen so far is
            // GTE the predicted percentile break, then we want to
            // emit a println().
            //
            if (scoresSoFar >= ((decileCount * decileChunk) +
                                (percentileCount * percentileChunk))) {
                // Compute what percentile of the items
                // we've reached
                double precisePercentile =
                    ((int) Math.round(((totalScores - scoresSoFar) /
                                       (totalScores * 1.0)) * 10000)) / 100.0;

                // Emit
                String equalityOperator = ">=";
                if ((totalScores - scoresSoFar) == 0) {
                    equalityOperator = ">";
                }
                pout.println(precisePercentile + "% (" +
                             (totalScores - scoresSoFar) + ") have score " +
                             equalityOperator + " " + reconstructedValue);

                // Bump our decile and percentile counters.
                // We may have to bump multiple times if
                // a single bucket carried us across several
                // boundaries.
                while (decileCount < 9 &&
                       scoresSoFar >= (decileCount * decileChunk) +
                                      (percentileCount * percentileChunk)) {
                    decileCount++;
                }
                if (decileCount >= 9) {
                    while (percentileCount < 10 &&
                           scoresSoFar >= (decileCount * decileChunk) +
                                          (percentileCount * percentileChunk)) {
                        percentileCount++;
                    }
                }

                // If we've reached the top percentile, then we're done!
                if (percentileCount >= 10) {
                    break;
                }
            }
        }

        pout.println();
        pout.println();
        pout.println("Min score is " + minScore);
        pout.println("Max score is " + maxScore);
        pout.println("Average score is " + (grandTotal / scoresSoFar));
    }

    /**
     */
    public static void main(String argv[]) throws IOException {
        if (argv.length < 1) {
            System.out.println("Usage: java net.nutch.util.ScoreStats [-real <db>] [-simulated <numScores> <min> <max> [seed]]");
            return;
        }

        File dbFile = null;
        long seed = new Random().nextLong();
        boolean simulated = false;
        int numScores = 0;
        float min = 0, max = 0;

        if ("-real".equals(argv[0])) {
            dbFile = new File(argv[1]);
        } else if ("-simulated".equals(argv[0])) {
            simulated = true;
            numScores = Integer.parseInt(argv[1]);
            min = Float.parseFloat(argv[2]);
            max = Float.parseFloat(argv[3]);
            if (argv.length > 4) {
                seed = Long.parseLong(argv[4]);
            }
        } else {
            System.out.println("No command specified");
            return;
        }

        System.out.println("Using seed: " + seed);

        ScoreStats ss = new ScoreStats();
        if (simulated) {
            // Generate numScores uniform random scores in [min, max).
            Random r = new Random(seed);
            for (int i = 0; i < numScores; i++) {
                float newScore = min + (r.nextFloat() * (max - min));
                ss.addScore(newScore);
            }
        } else {
            // Read the score of every page in the web db.
            IWebDBReader reader = new WebDBReader(dbFile);
            try {
                for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                    Page p = (Page) e.nextElement();
                    ss.addScore(p.getScore());
                }
            } finally {
                reader.close();
            }
        }

        ss.emitDistribution(System.out);
    }
}
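
/*
 * Editor's sketch (not part of the original Nutch source): a minimal
 * driver showing how the bucket scheme described in the class comment
 * plays out in practice.  The class name ScoreStatsBucketDemo is
 * hypothetical, and the bucket indices quoted in the comments are
 * approximate, assuming Float.MAX_VALUE ~= 3.4e38 (log10 ~= 38.5).
 */
class ScoreStatsBucketDemo {
    public static void main(String[] args) {
        ScoreStats stats = new ScoreStats();

        // A score below 1 lands in the evenly-spaced buckets 0..999:
        // 0.25 -> bucket floor(0.25 * 1000) = 250.
        stats.addScore(0.25f);

        // A score of 1 or more lands in the log-spaced buckets 1000..2000:
        // 100 -> bucket 1000 + floor(log10(100) * 1000 / 38.5) ~= 1051.
        stats.addScore(100.0f);

        // Print the estimated distribution for these two samples.
        stats.emitDistribution(System.out);
    }
}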